1. Data import

In [1]:
# Import project py file
from life_analysis import *
# Import general libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from textwrap import wrap
import warnings
warnings.filterwarnings('ignore')
# Import modelling libraries
import scipy.stats as stats
from itertools import combinations
from sklearn.preprocessing import StandardScaler, scale, Normalizer, PolynomialFeatures, MinMaxScaler
from sklearn.linear_model import LinearRegression, Lasso, Ridge, LassoCV, LassoLarsCV, LassoLarsIC
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.metrics import mean_squared_error
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.outliers_influence import variance_inflation_factor
In [2]:
# Import raw data
raw = pd.read_csv('analytic_data2019.csv', skiprows=[1,2])
display(raw.shape)
display(raw.head())
(3193, 534)
State FIPS Code County FIPS Code 5-digit FIPS Code State Abbreviation Name Release Year County Ranked (Yes=1/No=0) Premature death raw value Premature death numerator Premature death denominator ... Male population 18-44 raw value Male population 45-64 raw value Male population 65+ raw value Total male population raw value Female population 0-17 raw value Female population 18-44 raw value Female population 45-64 raw value Female population 65+ raw value Total female population raw value Population growth raw value
0 1 0 1000 AL Alabama 2019 NaN 9917.232898 80440.0 13636816.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 1 1 1001 AL Autauga County 2019 1.0 8824.057123 815.0 156132.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 1 3 1003 AL Baldwin County 2019 1.0 7224.632160 2827.0 576496.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 1 5 1005 AL Barbour County 2019 1.0 9586.165037 451.0 72222.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
4 1 7 1007 AL Bibb County 2019 1.0 11783.543675 445.0 63653.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

5 rows × 534 columns

2. Data Cleansing

2.1 Clean-up columns and data

In [3]:
# Remove unnecessary columns that contain these text
filter_out = ['numerator', 'denominator', 'CI low', 'CI high', '%', 'death',
              'percentage', 'Percentage', 'ratio', 'Ratio', 'mortality']

# Clean up characters from column names
replace_dict = {' raw value':'', ' - ':'_', '-':'_', '=':'', '/':'_',
                '(':'', ')':'', '.':'', '+':' above', ' ':'_'}

# Manually drop irrelevant columns
drop_list = ['Uninsured_adults', 'Uninsured_children', 'Population']

# Run cleansing function from life_analysis.py
df = data_cleansing(raw, filter_out, replace_dict, drop_list)
df = df.reset_index(drop=True)

# Assign a copy of cleansed df for mapping
df0 = df.copy()

# Basis of df to use for modelling. Skip columns 0-6 which are non-numerical
df = df.drop(df.iloc[:, 0:7], axis = 1)
cols = list(df.columns)
df.shape
Out[3]:
(3142, 48)

2.2 Remove outliers

In [4]:
# Remove outliers that lies k std deviations away from mean
k = 3.5
# Run remove outlier function from life_analysis.py
df = remove_outliers(df, k)
13.5% of rows removed
2719 rows remains
In [5]:
# Run log transformation function from life_analysis.py
target = 'Life_expectancy'
df = log_transform(df, target)

3. Data Exploration (EDA)

3.1 Overview of data via plots

In [6]:
# Plot all cleansed columns in df using:
# 1. Scatter plot to view relationship vs. target
# 2. Histogram to view spread of data set

# Run plot function from life_analysis.py
num_of_plots_per_row = 6
plot_all(df, num_of_plots_per_row, target)

3.2 Overview of target variable : Life Expectancy

In [7]:
# Plot heatmap by US county
import plotly.figure_factory as ff
values = df0['Life_expectancy'].tolist()
fips = df0['5_digit_FIPS_Code'].tolist()
colorscale = ['#ff0000', '#fe8000', '#e5aa00', '#b7b600', '#75a700', '#008000']
fig = ff.create_choropleth(
    fips=fips, values=values, scope=['USA'],
    binning_endpoints=[70,75,77,79,84], colorscale=colorscale,
    county_outline={'color': 'rgb(255,255,255)', 'width': 0}, round_legend_values=True,
    legend_title='Life Expectancy by US County')
fig.layout.template = None
fig.show()

3.3 Split training and test data

In [8]:
# Split parameters
seed = 1
test_size = 1000/df.shape[0]

# Split target and predictors
y_all = df['Life_expectancy']
x_all = df.drop('Life_expectancy', axis=1)

# Split 80/20 for training/testing 
x_train, x_test, y_train, y_test = train_test_split(x_all, y_all, test_size=test_size, random_state=seed)

# Fit to train set
scaler = StandardScaler()
scaler.fit(x_train)

# Assign and transform training data
x_train = pd.DataFrame(scaler.transform(x_train), columns=x_all.columns) # transform training data
y_train = (pd.DataFrame(y_train)).reset_index(drop=True)
dftrain = pd.concat([y_train, x_train], axis=1)

# Assign and transform test data
x_test = pd.DataFrame(scaler.transform(x_test), columns=x_all.columns) # transform training data
y_test = (pd.DataFrame(y_test)).reset_index(drop=True)

4. Feature Selection (Part 1): Evaluate predictors

4.1 Baseline model : calculate k-fold cv with cleansed predictors

In [9]:
# Cross validation using all columns
regression = LinearRegression()
crossvalidation = KFold(n_splits=5, shuffle=True, random_state=seed)
baseline_rsq = np.mean(cross_val_score(regression, x_train, y_train, scoring='r2', cv=crossvalidation))
baseline_mse = -np.mean(cross_val_score(regression, x_train, y_train, cv=5, scoring='neg_mean_squared_error'))
print(f'No of predictors: {x_train.shape[1]}')
print(f'kfold cv r-sq: {round(baseline_rsq,4)}')
print(f'kfold cv MSE : {round(baseline_mse,4)}')
No of predictors: 47
kfold cv r-sq: 0.6992
kfold cv MSE : 1.9451

4.2 Baseline model : investigate regularization using Lasso

In [10]:
model_aic = LassoLarsIC(criterion='aic')
model_aic.fit(x_train, y_train)

lasso = Lasso(alpha = model_aic.alpha_) 
lasso.fit(x_train, y_train)
lasso_rsq = lasso.score(x_train, y_train)
lasso_mse = mean_squared_error(y_train, lasso.predict(x_train))

print(f'No of predictors  : {x_train.shape[1]}')
print(f'Optimum AIC alpha : {round(model_aic.alpha_,4)}')
print(f'After Lasso regularization r-sq: {round(lasso_rsq,4)}')
print(f'After Lasso regularization MSE : {round(lasso_mse,4)}')
No of predictors  : 47
Optimum AIC alpha : 0.0012
After Lasso regularization r-sq: 0.7222
After Lasso regularization MSE : 1.798

4.3 Evaluate predictors (Step 1) : P-value of baseline predictors vs. target (Life expectancy)

In [11]:
# Check p-value using StatsModel (not available in Scikit-learn)

outcome = 'Life_expectancy'
x_cols = list(x_train.columns)
predictors = '+'.join(x_cols)
formula = outcome + '~' + predictors
model = ols(formula=formula, data=dftrain).fit()
summary = model.summary()

pt = summary.tables[1]
pt = pd.DataFrame(pt.data)
pt.columns = pt.iloc[0]
pt = pt.drop(pt.index[:2])
pt = pt[['','P>|t|','coef']]
pt = pt.rename(columns={'':'Predictor'})
pt = pt.rename(columns={'P>|t|':'Pvalue'})
pt['Pvalue'] = pt['Pvalue'].astype(float)
pt['coef'] = pt['coef'].astype(float)
pt['sign'] = pt['coef'].map(lambda row: '+' if row > 0 else '-')
pt['coef'] = pt['coef'].map(lambda row: abs(row))

pt = pt.sort_values(['Pvalue', 'coef'], ascending=[1, 0])
pt = pt[pt['Pvalue'] < 0.05]
print(f'List of {len(pt)} predictors with p-value < 0.05')
display(pt)

list_predictors = list(pt['Predictor'])
List of 21 predictors with p-value < 0.05
Predictor Pvalue coef sign
36 Frequent_physical_distress 0.000 1.4509 +
13 Teen_births 0.000 0.6561 -
19 Mammography_screening 0.000 0.3591 +
6 Adult_smoking 0.000 0.3507 -
35 Long_commute_driving_alone 0.000 0.2172 -
34 Driving_alone_to_work 0.000 0.2064 -
18 Preventable_hospital_stays 0.000 0.1799 -
17 Mental_health_providers 0.000 0.1789 -
9 Physical_inactivity 0.001 0.2192 -
3 Poor_physical_health_days 0.002 0.9692 -
21 High_school_graduation 0.002 0.1215 -
16 Dentists 0.004 0.1401 +
5 Low_birthweight 0.006 0.1437 -
42 Other_primary_care_providers 0.006 0.1190 -
38 Diabetes_prevalence 0.008 0.1923 -
30 Violent_crime 0.009 0.1080 -
43 Median_household_income 0.010 0.2915 +
44 Median_household_income_White 0.015 0.1948 +
37 Frequent_mental_distress 0.022 0.6883 -
39 Food_insecurity 0.035 0.2641 -
14 Uninsured 0.047 0.1109 +

4.4 Evaluate predictors (Step 2) : Correlation of predictors vs. target (Life expectancy)

In [12]:
# Calculate correlation coeff with life expectancy for all predictors
cols = list(dftrain.columns)
corr_list = []

for col in cols:    
    corr = stats.pearsonr(dftrain[col], dftrain.Life_expectancy)    
    corr_list.append([abs(round(corr[0],3)), col])

# Show sorted list of columns and correlations between life expectancy vs. column and log(column)
corr_list = sorted(corr_list, key=lambda i: i[0], reverse=True)
k = 20
print(f'Top {k} Correlation between all predictors vs. Life expectancy:')
display(corr_list[1:k+1])
Top 20 Correlation between all predictors vs. Life expectancy:
[[0.691, 'Frequent_physical_distress'],
 [0.69, 'Frequent_mental_distress'],
 [0.689, 'Poor_or_fair_health'],
 [0.683, 'Adult_smoking'],
 [0.675, 'Poor_physical_health_days'],
 [0.665, 'Diabetes_prevalence'],
 [0.66, 'Children_in_poverty'],
 [0.655, 'Teen_births'],
 [0.64, 'Poor_mental_health_days'],
 [0.64, 'Median_household_income'],
 [0.637, 'Physical_inactivity'],
 [0.595, 'Food_insecurity'],
 [0.582, 'Excessive_drinking'],
 [0.567, 'Insufficient_sleep'],
 [0.554, 'Median_household_income_White'],
 [0.542, 'Low_birthweight'],
 [0.539, 'Children_in_poverty_White'],
 [0.526, 'Children_eligible_for_free_or_reduced_price_lunch'],
 [0.498, 'Some_college'],
 [0.487, 'Children_in_single_parent_households']]
In [13]:
# Top p-values
top_n = 20
top_predictors = list_predictors[:top_n]
print(f'Top {top_n} strongest linear correlation with Life Expectancy:')
display(top_predictors)
Top 20 strongest linear correlation with Life Expectancy:
['Frequent_physical_distress',
 'Teen_births',
 'Mammography_screening',
 'Adult_smoking',
 'Long_commute_driving_alone',
 'Driving_alone_to_work',
 'Preventable_hospital_stays',
 'Mental_health_providers',
 'Physical_inactivity',
 'Poor_physical_health_days',
 'High_school_graduation',
 'Dentists',
 'Low_birthweight',
 'Other_primary_care_providers',
 'Diabetes_prevalence',
 'Violent_crime',
 'Median_household_income',
 'Median_household_income_White',
 'Frequent_mental_distress',
 'Food_insecurity']

4.5 Evaluate predictors (Step 3) : Multicollinearity between predictors

In [14]:
# Check correlation between top columns - to avoid multicollinearity
combo1 = list(combinations(top_predictors, 2))
corr_list1 = []
for comb in combo1:
    corr = stats.pearsonr(dftrain[comb[0]], dftrain[comb[1]])
    corr_list1.append([abs(round(corr[0],3)), comb[0], comb[1]])  
    
# Show sorted list of columns and correlations between life expectancy vs. column and log(column)
corr_list1 = sorted(corr_list1, key=lambda i: i[0], reverse=True)
print('Correlation between predictors to avoid multicollinearity (0.8 cut-off):')
display(list(filter(lambda i: i[0] > 0.8, corr_list1)))
Correlation between predictors to avoid multicollinearity (0.8 cut-off):
[[0.985, 'Frequent_physical_distress', 'Poor_physical_health_days'],
 [0.953, 'Frequent_physical_distress', 'Frequent_mental_distress'],
 [0.949, 'Poor_physical_health_days', 'Frequent_mental_distress'],
 [0.847, 'Median_household_income', 'Median_household_income_White']]
In [15]:
# Drop top predictors based on multicollinearity etc:
drop_list = ['Frequent_physical_distress', 'Poor_physical_health_days', 'Median_household_income_White']
top_predictors = [i for i in top_predictors if i not in drop_list]
len(top_predictors)
Out[15]:
17
In [16]:
# Validate list via correlation heatmap between top predictors after drop:
sns.heatmap(abs(df[top_predictors].corr()), cmap="PiYG", center=0)
plt.show()

4.6 Model 1 : evaluate using top predictors

In [17]:
# Cross validation using top predictors only
x_train = x_train[top_predictors[:]]

regression = LinearRegression()
crossvalidation = KFold(n_splits=5, shuffle=True, random_state=seed)
model1_rsq = np.mean(cross_val_score(regression, x_train, y_train, scoring='r2', cv=crossvalidation))
model1_mse = -np.mean(cross_val_score(regression, x_train, y_train, cv=5, scoring='neg_mean_squared_error'))
print(f'No of predictors: {x_train.shape[1]}')
print(f'kfold cv r-sq: {round(model1_rsq,4)}')
print(f'kfold cv MSE : {round(model1_mse,4)}')
No of predictors: 17
kfold cv r-sq: 0.6974
kfold cv MSE : 1.9651

4.7 Evaluate predictors (Step 4) : Interaction between top predictors

In [18]:
# Check interactions between pairs of top predictors
regression = LinearRegression()
crossvalidation = KFold(n_splits=5, shuffle=True, random_state=seed)
combo2 = list(combinations(x_train.columns, 2))
interactions = []
data = pd.DataFrame(scale(x_train), columns=x_train.columns)
for comb in combo2:
    data['interaction'] = data[comb[0]] * data[comb[1]]
    score = np.mean(cross_val_score(regression, data, y_train, scoring='r2', cv=crossvalidation))
    if score > model1_rsq: interactions.append((round(score, 3), comb[0], comb[1]))
interactions = sorted(interactions, key=lambda i: i[0], reverse=True)
k = 20
print(f'Max value of r2: {max(interactions)[0]}')
print(f'Min value of r2: {min(interactions)[0]}')
print(f'Top {k} Interaction cross validation r2 between predictors:')
display(interactions[:k])
Max value of r2: 0.702
Min value of r2: 0.697
Top 20 Interaction cross validation r2 between predictors:
[(0.702, 'Teen_births', 'Adult_smoking'),
 (0.702, 'Teen_births', 'Physical_inactivity'),
 (0.701, 'Teen_births', 'Preventable_hospital_stays'),
 (0.701, 'Teen_births', 'Frequent_mental_distress'),
 (0.7, 'Teen_births', 'Low_birthweight'),
 (0.7, 'Teen_births', 'Diabetes_prevalence'),
 (0.7, 'Teen_births', 'Food_insecurity'),
 (0.7, 'Long_commute_driving_alone', 'High_school_graduation'),
 (0.7, 'Mental_health_providers', 'Dentists'),
 (0.699, 'Teen_births', 'Mammography_screening'),
 (0.699, 'Teen_births', 'Violent_crime'),
 (0.699, 'Mammography_screening', 'Preventable_hospital_stays'),
 (0.699, 'Mammography_screening', 'Physical_inactivity'),
 (0.699, 'Mammography_screening', 'Diabetes_prevalence'),
 (0.699, 'Mammography_screening', 'Frequent_mental_distress'),
 (0.699, 'Mammography_screening', 'Food_insecurity'),
 (0.699, 'Adult_smoking', 'Preventable_hospital_stays'),
 (0.699, 'Adult_smoking', 'Low_birthweight'),
 (0.699, 'Adult_smoking', 'Other_primary_care_providers'),
 (0.699, 'Long_commute_driving_alone', 'Driving_alone_to_work')]

4.8 Evaluate predictors (Step 5): Polynomial terms

In [19]:
# Explore polynomial terms
degrees = [2, 3]
polynomials = []
for col in x_train.columns:
    for degree in degrees:
        data = x_train.copy()
        poly = PolynomialFeatures(degree, include_bias=False)
        x_transformed = poly.fit_transform(x_train[[col]])
        data = pd.concat([data.drop(col, axis=1),pd.DataFrame(x_transformed)], axis=1)
        score = np.mean(cross_val_score(regression, data, y_train, scoring='r2', cv=crossvalidation))
        polynomials.append((round(score, 3), col, degree))
polynomials = sorted(polynomials, key=lambda i: i[0], reverse=True)
k=20
print(f'Top {k} r2 of polynomial terms for top predictors for degree={degrees}')
display(polynomials[:20])
# Show Top 20:
Top 20 r2 of polynomial terms for top predictors for degree=[2, 3]
[(0.701, 'Teen_births', 2),
 (0.701, 'Teen_births', 3),
 (0.698, 'Adult_smoking', 2),
 (0.698, 'Adult_smoking', 3),
 (0.698, 'Mental_health_providers', 3),
 (0.698, 'Physical_inactivity', 2),
 (0.698, 'Physical_inactivity', 3),
 (0.698, 'Dentists', 2),
 (0.698, 'Violent_crime', 2),
 (0.698, 'Frequent_mental_distress', 3),
 (0.697, 'Mammography_screening', 2),
 (0.697, 'Mammography_screening', 3),
 (0.697, 'Long_commute_driving_alone', 2),
 (0.697, 'Long_commute_driving_alone', 3),
 (0.697, 'Mental_health_providers', 2),
 (0.697, 'High_school_graduation', 2),
 (0.697, 'High_school_graduation', 3),
 (0.697, 'Dentists', 3),
 (0.697, 'Low_birthweight', 2),
 (0.697, 'Low_birthweight', 3)]

4.9 Add top interaction terms and top polynomial terms into data frame

In [20]:
# Polynomial terms: select columns with highest cv r2
poly_columns = ['Teen_births', 'Adult_smoking', 'Physical_inactivity']

# Interaction terms: select pairs of columns with highest cv r2
inter_columns = [['Physical_inactivity', 'Adult_smoking']]

# Build new df with Polynomial terms
dfpoly = pd.DataFrame()
for col in poly_columns:
    poly = PolynomialFeatures(3, include_bias=False)
    x_transformed = poly.fit_transform(x_train[[col]])
    colnames= [col, col + '_' + 'power2',  col + '_' + 'power3']
    dfpoly = (pd.concat([dfpoly, pd.DataFrame(x_transformed, columns=colnames)], axis=1)).drop([col], axis=1)

# Build new df with Interaction terms
dfinter = pd.DataFrame()
for col in inter_columns:
    colname = f'{col[0]}_AND_{col[1]}'
    dfinter[colname] = x_train[col[0]] * x_train[col[1]]
    
# Merge with all predictors
df_shortlist = pd.concat([y_train, x_train, dfpoly, dfinter], axis=1)
x_shortlist = df_shortlist.drop('Life_expectancy', axis=1)
x_shortlist.columns
Out[20]:
Index(['Teen_births', 'Mammography_screening', 'Adult_smoking',
       'Long_commute_driving_alone', 'Driving_alone_to_work',
       'Preventable_hospital_stays', 'Mental_health_providers',
       'Physical_inactivity', 'High_school_graduation', 'Dentists',
       'Low_birthweight', 'Other_primary_care_providers',
       'Diabetes_prevalence', 'Violent_crime', 'Median_household_income',
       'Frequent_mental_distress', 'Food_insecurity', 'Teen_births_power2',
       'Teen_births_power3', 'Adult_smoking_power2', 'Adult_smoking_power3',
       'Physical_inactivity_power2', 'Physical_inactivity_power3',
       'Physical_inactivity_AND_Adult_smoking'],
      dtype='object')

5.1 Model 2 : use top predictors + top interactions + top polynomial terms

In [21]:
# Calculate new baseline with shortlisted columns, interactions and polynomial terms
x_train = x_shortlist
regression = LinearRegression()
crossvalidation = KFold(n_splits=5, shuffle=True, random_state=seed)
model2_rsq = np.mean(cross_val_score(regression, x_train, y_train, scoring='r2', cv=crossvalidation))
model2_mse = -np.mean(cross_val_score(regression, x_train, y_train, cv=5, scoring='neg_mean_squared_error'))
print(f'No of predictors: {x_train.shape[1]}')
print(f'kfold cv r-sq: {round(model2_rsq,4)}')
print(f'kfold cv MSE : {round(model2_mse,4)}')
No of predictors: 24
kfold cv r-sq: 0.702
kfold cv MSE : 1.9404

5.2 Determine strongest predictor terms (based on correlation)

In [22]:
# Calculate correlation coeff with Life Expectancy:
cols = list(df_shortlist.iloc[:, 1:].columns) # skip Life expectancy
corr_list = []
top_predictors = []

for col in cols:    
    # Correlation with x    
    corr = stats.pearsonr(df_shortlist[col], df_shortlist.Life_expectancy)    
    # Correlation with log x
    transformer = Normalizer().fit([df_shortlist[col]])
    t = transformer.transform([df_shortlist[col]])   
    if df_shortlist[col].min() <= 0:
        corrlog = 0
    else:
        corrlog = stats.pearsonr(np.log(t[0]), df_shortlist.Life_expectancy)[0]
    # Output list
    corr_list.append([abs(round(corr[0],3)), col])
    top_predictors.append(col)

# Show sorted list of columns and correlations between life expectancy vs. column and log(column)
corr_list = sorted(corr_list, key=lambda i: i[0], reverse=True)
print(f'Sorted correlation between all predictors vs. Life expectancy:')
display(corr_list[:])
Sorted correlation between all predictors vs. Life expectancy:
[[0.69, 'Frequent_mental_distress'],
 [0.683, 'Adult_smoking'],
 [0.665, 'Diabetes_prevalence'],
 [0.655, 'Teen_births'],
 [0.64, 'Median_household_income'],
 [0.637, 'Physical_inactivity'],
 [0.595, 'Food_insecurity'],
 [0.542, 'Low_birthweight'],
 [0.466, 'Preventable_hospital_stays'],
 [0.398, 'Physical_inactivity_power3'],
 [0.378, 'Mammography_screening'],
 [0.369, 'Driving_alone_to_work'],
 [0.349, 'Teen_births_power3'],
 [0.327, 'Adult_smoking_power3'],
 [0.297, 'Dentists'],
 [0.25, 'Violent_crime'],
 [0.215, 'Long_commute_driving_alone'],
 [0.206, 'Teen_births_power2'],
 [0.147, 'Physical_inactivity_power2'],
 [0.119, 'Mental_health_providers'],
 [0.071, 'Physical_inactivity_AND_Adult_smoking'],
 [0.047, 'Adult_smoking_power2'],
 [0.045, 'High_school_graduation'],
 [0.026, 'Other_primary_care_providers']]

5.3 Determine strongest predictor terms (based on standardized coefficient)

In [23]:
# Calculate coefficients
linreg = LinearRegression()
linreg.fit(x_train, y_train)
coeff = linreg.coef_
coeff_list = []
for n, col in enumerate(cols):
    coeff_list.append([abs(round(coeff[0][n],3)), 'Negative' if coeff[0][n] < 0 else 'Positive', col])
coeff_list = sorted(coeff_list, key=lambda i: i[0], reverse=True)
coeff_list
Out[23]:
[[0.58, 'Negative', 'Teen_births'],
 [0.559, 'Negative', 'Adult_smoking'],
 [0.386, 'Negative', 'Diabetes_prevalence'],
 [0.323, 'Negative', 'Food_insecurity'],
 [0.291, 'Positive', 'Median_household_income'],
 [0.272, 'Negative', 'Driving_alone_to_work'],
 [0.262, 'Negative', 'Mental_health_providers'],
 [0.259, 'Negative', 'Physical_inactivity'],
 [0.25, 'Positive', 'Mammography_screening'],
 [0.237, 'Negative', 'Long_commute_driving_alone'],
 [0.165, 'Negative', 'Preventable_hospital_stays'],
 [0.156, 'Negative', 'Other_primary_care_providers'],
 [0.152, 'Negative', 'Teen_births_power2'],
 [0.135, 'Negative', 'High_school_graduation'],
 [0.113, 'Positive', 'Dentists'],
 [0.092, 'Negative', 'Violent_crime'],
 [0.062, 'Negative', 'Low_birthweight'],
 [0.058, 'Positive', 'Physical_inactivity_AND_Adult_smoking'],
 [0.03, 'Positive', 'Frequent_mental_distress'],
 [0.029, 'Positive', 'Adult_smoking_power3'],
 [0.019, 'Positive', 'Physical_inactivity_power3'],
 [0.016, 'Negative', 'Teen_births_power3'],
 [0.015, 'Positive', 'Adult_smoking_power2'],
 [0.014, 'Negative', 'Physical_inactivity_power2']]

5.4 Evaluate linear regression model assumptions via residual analysis

In [24]:
# Investigate residuals
dfplot = pd.concat([y_train, x_train], axis=1)
outcome = 'Life_expectancy'
x_cols = list(x_train.columns)
predictors = '+'.join(x_cols)
formula = outcome + '~' + predictors
model2 = ols(formula=formula, data=dfplot).fit()
summary = model2.summary()
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12,5))

# Residual plot: check that residuals are non-deterministic by checking that
# there are no discernable patterns from the plot
axes[0].scatter(model2.predict(x_train), model2.resid, color='blue', alpha=0.3)
axes[0].axhline(y=0, color='black')
axes[0].set_xlim(72,82)
axes[0].set_ylim(-5,5)
axes[0].set_xlabel('Fitted value')
axes[0].set_title('Homoscedasticity of residuals check: \n Scatter plot', fontsize=14)

# Q-Q plot: check that residuals are normally distributed by checking that
# residuals lie within the central line
fig = sm.graphics.qqplot(model2.resid, dist=stats.norm, line='45', fit=True, ax=axes[1])
axes[1].axhline(y=0, color='black')
axes[1].set_xlim(-4,4)
axes[1].set_ylim(-4,4)
axes[1].set_title('Normality of residuals check: \n Q-Q plot', fontsize=14)
plt.show()
In [25]:
# Conclusions: Residuals do not have deterministic characteristics (random) and thus homoscedastic.
# Also, the residuals are largely normally distributed based on Q-Q plot although the tail ends could be
# improved further despite the fact that outliers have been largely removed during data cleansing.

6. Final model

6.1 Prepare final training and test data

In [26]:
# Assign training data to the final choice of predictors
x_train = x_shortlist

# Test data still has the same number of original columns.
# Now reshape test data columns to match the final set of columns in train data:
x_test_raw = x_test
final_columns = list(x_train.columns)
df_test = pd.DataFrame()
for col in final_columns:
    # Create interaction terms
    if col.find('_AND_') != -1:
        c = 1
        inter1 = col[0:col.find('_AND_')]
        inter2 = col[(-1)*(len(col)-5-len(inter1)):]
        c = [inter1,inter2]
        df_test[f'{inter1}_AND_{inter2}'] = x_test_raw[inter1] * x_test_raw[inter2]
    # Create polynomial terms
    elif col.find('_power') != -1:
        polycol = col[:len(col)-7]
        power = int(col[-1:])
        c = [polycol, power]
        df_test[f'{polycol}_power{power}'] = x_test_raw[polycol]      
    # Original terms
    else:
        c = col
        df_test[col] = x_test_raw[col]

# Assign test data (after processing)
x_test = pd.DataFrame(df_test, columns=df_test.columns) # scale training data
y_test = (pd.DataFrame(y_test)).reset_index(drop=True)

# Double check data dimensions are the comparable
print(f'x_train:{x_train.shape}, y_train:{y_train.shape}, x_test:{x_test.shape}, y_test:{y_test.shape}')
x_train:(1719, 24), y_train:(1719, 1), x_test:(1000, 24), y_test:(1000, 1)

6.2 Run final model with training and test data

In [27]:
# Run model with training and test data

linreg = LinearRegression()
linreg.fit(x_train, y_train)

pred_train = linreg.predict(x_train)
pred_test  = linreg.predict(x_test)

final_train_rsq = linreg.score(x_train, y_train)
final_test_rsq  = linreg.score(x_test, y_test)
final_train_mse = mean_squared_error(y_train, pred_train)
final_test_mse  = mean_squared_error(y_test, pred_test)

print('Summary of r^2 improvements')
print('---------------------------')
print('1. Baseline                      :', round(baseline_rsq,4))
print('2. Baseline with Lasso           :', round(lasso_rsq,4))
print('3. Model 1 using strongest corr  :', round(model1_rsq,4))
print('4. Model 2 incl poly & interacts :', round(model2_rsq,4))
print('5. Model 3 (final) with Train set:', round(final_train_rsq,4))
print('6. Model 3 (final) with Test set :', round(final_test_rsq,4))
print('')
print('Summary of MSE improvements')
print('---------------------------')
print('1. Baseline                      :', round(baseline_mse,4))
print('2. Baseline with Lasso           :', round(lasso_mse,4))
print('3. Model 1 using strongest corr  :', round(model1_mse,4))
print('4. Model 2 incl poly & interacts :', round(model2_mse,4))
print('5. Model 3 (final) with Train set:', round(final_train_mse,4))
print('6. Model 3 (final) with Test set :', round(final_test_mse,4))
Summary of r^2 improvements
---------------------------
1. Baseline                      : 0.6992
2. Baseline with Lasso           : 0.7222
3. Model 1 using strongest corr  : 0.6974
4. Model 2 incl poly & interacts : 0.702
5. Model 3 (final) with Train set: 0.7127
6. Model 3 (final) with Test set : 0.6642

Summary of MSE improvements
---------------------------
1. Baseline                      : 1.9451
2. Baseline with Lasso           : 1.798
3. Model 1 using strongest corr  : 1.9651
4. Model 2 incl poly & interacts : 1.9404
5. Model 3 (final) with Train set: 1.86
6. Model 3 (final) with Test set : 2.2046
In [28]:
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12,5))
axes[0].scatter(y_train, pred_train, label='Model', color='blue', alpha=0.3)
axes[0].plot(y_train, y_train, label='TRAIN data', color='blue')
axes[1].scatter(y_test, pred_test, label='Model', color='red', alpha=0.3)
axes[1].plot(y_test, y_test, label='TEST data', color='red')
axes[0].set_xlim(65,90)
axes[0].set_ylim(65,90)
axes[1].set_xlim(65,90)
axes[1].set_ylim(65,90)
axes[0].set_ylabel('Life expectancy')
axes[0].legend(loc='upper left')
axes[1].legend(loc='upper left')
axes[0].set_title(f'TRAIN data vs. Model \n sample n={x_train.shape[0]}, MSE={round(final_train_mse,2)}, r^2={round(final_train_rsq,2)}', fontsize=15)
axes[1].set_title(f'TEST data vs. Model \n sample n={x_test.shape[0]}, MSE={round(final_test_mse,2)}, r^2={round(final_test_rsq,2)}', fontsize=15)
plt.legend()
plt.show()

7. Conclusions

From the study and models conducted on the impact of various health and lifestyle factors to life expectancy in the US, we came up with following key conclusions:

  • Teen births,Cancer,STIs smoking and diabetes prevalence are identified as top contributors to lower life expectancy.